{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read 'cleaned_discharge_summaries.csv' and discard every row that has a missing value in any column.\n",
    "codes = (\n",
    "    pd.read_csv('cleaned_discharge_summaries.csv')\n",
    "    .dropna()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split names; this order is also the order used when the ids are concatenated below.\n",
    "keys = ['train', 'dev', 'test']\n",
    "\n",
    "# Each '<split>_50_hadm_ids.csv' file is read without a header row and its first column kept as a list.\n",
    "hadm_ids = {}\n",
    "for split in keys:\n",
    "    split_frame = pd.read_csv('%s_50_hadm_ids.csv' % split, header=None)\n",
    "    hadm_ids[split] = list(split_frame[0])\n",
    "\n",
    "# Single flat list holding the ids of all three splits (train, then dev, then test).\n",
    "hadm_ids_total = [hadm_id for split in keys for hadm_id in hadm_ids[split]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the rows whose HADM_ID appears in one of the train/dev/test id lists.\n",
    "# reset_index(drop=True) renumbers the rows 0..n-1 without first materialising the old index\n",
    "# as an 'index' column (which would also clash with any existing column of that name).\n",
    "codes = codes[codes['HADM_ID'].isin(hadm_ids_total)].reset_index(drop=True)\n",
    "\n",
    "# Sanity check: the number of remaining rows must equal the number of listed ids.\n",
    "assert len(codes) == len(hadm_ids_total), (\n",
    "    'expected %d rows after filtering on HADM_ID, found %d' % (len(hadm_ids_total), len(codes))\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per-split note texts and binary targets.\n",
    "# A row's target is 1 when any of its ';'-separated ICD9 codes starts with '250.00', otherwise 0.\n",
    "texts, labels = {}, {}\n",
    "for split in keys:\n",
    "    in_split = codes['HADM_ID'].isin(hadm_ids[split])\n",
    "    split_rows = codes[in_split]\n",
    "    texts[split] = list(split_rows['TEXT'])\n",
    "    labels[split] = [\n",
    "        int(any(code.startswith('250.00') for code in code_string.split(';')))\n",
    "        for code_string in split_rows['ICD9_CODE']\n",
    "    ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Flatten the per-split texts and labels into parallel columns, in the order given by keys,\n",
    "# and tag every row with the name of the split it came from.\n",
    "df_texts = [text for split in keys for text in texts[split]]\n",
    "df_labels = [target for split in keys for target in labels[split]]\n",
    "df_exp_split = [split for split in keys for _ in texts[split]]\n",
    "\n",
    "# One row per text; written without the pandas index column.\n",
    "df = pd.DataFrame({'text': df_texts, 'label': df_labels, 'exp_split': df_exp_split})\n",
    "df.to_csv('mimic_diabetes_dataset.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Vocabulary size :  18888\n",
      "Found 18884 words in model out of 18888\n"
     ]
    }
   ],
   "source": [
    "# Run ../preprocess_data_BC.py on mimic_diabetes_dataset.csv (the CSV written by the previous cell),\n",
    "# passing ./vec_diabetes.p as the output file.\n",
    "# NOTE(review): the script itself is not visible from this notebook - presumably it builds a vocabulary\n",
    "# and vectorised dataset using the 'mimic' word vectors and a minimum document frequency of 5;\n",
    "# confirm against the script before relying on this description.\n",
    "%run \"../preprocess_data_BC.py\" --data_file mimic_diabetes_dataset.csv --output_file ./vec_diabetes.p \\\n",
    "--word_vectors_type mimic --min_df 5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
